/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.net.protocols.http; import net.nutch.net.protocols.Response; import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PushbackInputStream; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.Socket; import java.net.URL; import java.util.Map; import java.util.TreeMap; import java.util.logging.Level; import net.nutch.fetcher.FetcherConstants; import net.nutch.fetcher.FetcherStatus; import net.nutch.util.GZIPUtils; /** An HTTP response. */ public class HttpResponse implements Response { private URL url; private final Http http; private int code; private int numContinues; private Map headers; private byte[] content; private byte[] compressedContent; private MiscHttpAccounting httpAccounting; public URL getUrl() { return url; } /** Returns the response code. */ public int getCode() { return code; } /** Returns the value of a named header. */ public String getHeader(String name) { return (String)headers.get(name); } /** Returns the full content of the response. */ public byte[] getContent() { return content; } /** * Returns the compressed version of the content if the server * transmitted a compressed version, or <code>null</code> * otherwise. */ public byte[] getCompressedContent() { return compressedContent; } /** * Returns the number of 100/Continue headers encountered */ public int getNumContinues() { return numContinues; } HttpResponse(Http http, URL url) throws IOException, HttpException { this(http, url, null, null, Http.HTTP_VER_LATEST); } HttpResponse(Http http, URL url, InetAddress addr, MiscHttpAccounting httpAccounting, int httpVersion) throws IOException, HttpException { this.url = url; this.httpAccounting= httpAccounting; this.http = http; if (!"http".equals(url.getProtocol())) throw new IOException("Not an HTTP url:" + url); if ( (httpVersion < 0) || (httpVersion > Http.HTTP_VER_LATEST) ) httpVersion= Http.HTTP_VER_LATEST; if (Http.LOG.isLoggable(Level.FINE)) Http.LOG.fine("fetching " + url); String path = "".equals(url.getFile()) ? "/" : url.getFile(); // some servers will redirect a request with a host line like // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they // don't want the :80... int port; String portString; if (url.getPort() == -1) { port= 80; portString= ""; } else { port= url.getPort(); portString= ":" + port; } Socket socket = null; try { socket = new Socket(); // create the socket socket.setSoTimeout(this.http.timeout); if (addr == null) { addr=InetAddress.getByName(this.http.proxyenabled?this.http.proxyHost:url.getHost()); if (httpAccounting != null) httpAccounting.setAddr(addr); } // connect InetSocketAddress sockAddr= new InetSocketAddress(addr, this.http.proxyenabled?this.http.proxyPort:port); socket.connect(sockAddr, this.http.timeout); OutputStream req = socket.getOutputStream(); // make request StringBuffer reqStr = new StringBuffer("GET "); if(this.http.proxyenabled){ reqStr.append(url.getProtocol()).append("://").append(url.getHost()).append(portString).append(path); } else { reqStr.append(path); } if (httpVersion == Http.HTTP_VER_1_1) reqStr.append(" HTTP/1.1\r\n"); else reqStr.append(" HTTP/1.0\r\n"); reqStr.append("Host: "); reqStr.append(url.getHost()); reqStr.append(portString); reqStr.append("\r\n"); if (httpVersion == Http.HTTP_VER_1_1) { reqStr.append("Accept-Encoding: x-gzip, gzip\r\n"); reqStr.append("Connection: close\r\n"); } if ((this.http.agentString == null) || (this.http.agentString.length() == 0)) { Http.LOG.severe("User-agent is not set!"); } else { reqStr.append("User-Agent: "); reqStr.append(this.http.agentString); reqStr.append("\r\n"); } reqStr.append("\r\n"); byte[] reqBytes= reqStr.toString().getBytes(); if (httpAccounting != null) httpAccounting.incrementBytesSent(reqBytes.length); req.write(reqBytes); req.flush(); PushbackInputStream in = // process response new PushbackInputStream( new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE) ; StringBuffer line = new StringBuffer(); numContinues= -1; boolean haveSeenNonContinueStatus= false; while (!haveSeenNonContinueStatus) { numContinues++; // parse status code line this.code = parseStatusLine(in, line); // parse headers this.headers = parseHeaders(in, line); haveSeenNonContinueStatus= code != 100; // 100 is "Continue" } String transferCoding= getHeader("Transfer-Encoding"); if ("chunked".equals(transferCoding)) { Http.LOG.fine("fetching chunked!"); try { readChunkedContent(in, line); } catch (EOFException e) { throw new ChunkEOFException(""); } } else { Http.LOG.fine("fetching plain!"); readPlainContent(in); } String contentEncoding= getHeader("Content-Encoding"); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { Http.LOG.fine("uncompressing...."); compressedContent= content; FetcherStatus.logTraceMisc(FetcherConstants.MISC_INFORMATIONAL, "about to decompress: " + url); content= GZIPUtils.unzipBestEffort(compressedContent, this.http.maxContentLength); if (content == null) throw new DecompressionException("unzipBestEffort returned null"); if (Http.LOG.isLoggable(Level.FINE)) Http.LOG.fine("fetched " + compressedContent.length + " bytes of compressed content (expanded to " + content.length + " bytes) from " + url); } else { if (Http.LOG.isLoggable(Level.FINE)) Http.LOG.fine("fetched " + content.length + " bytes from " + url); } } finally { if (socket != null) socket.close(); } } private void readPlainContent(InputStream in) throws HttpException, IOException { int contentLength = Integer.MAX_VALUE; // get content length String contentLengthString = (String)headers.get("Content-Length"); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { contentLength = Integer.parseInt(contentLengthString); } catch (NumberFormatException e) { throw new ContentLengthParseException(contentLengthString); } } if (contentLength > this.http.maxContentLength) // limit download size contentLength = this.http.maxContentLength; ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); byte[] bytes = new byte[Http.BUFFER_SIZE]; int length = 0; // read content for (int i = in.read(bytes); i != -1; i = in.read(bytes)) { if (httpAccounting != null) httpAccounting.incrementBytesRead(i); out.write(bytes, 0, i); length += i; if (length >= contentLength) break; } this.content = out.toByteArray(); } private void readChunkedContent(PushbackInputStream in, StringBuffer line) throws HttpException, IOException { boolean doneChunks= false; int contentBytesRead= 0; byte[] bytes = new byte[Http.BUFFER_SIZE]; ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); while (!doneChunks) { Http.LOG.fine("Http: starting chunk"); Http.readLine(in, line, false); if (httpAccounting != null) httpAccounting.incrementBytesRead(line.length()); String chunkLenStr; // LOG.fine("chunk-header: '" + line + "'"); int pos= line.indexOf(";"); if (pos < 0) { chunkLenStr= line.toString(); } else { chunkLenStr= line.substring(0, pos); // LOG.fine("got chunk-ext: " + line.substring(pos+1)); } chunkLenStr= chunkLenStr.trim(); int chunkLen; try { chunkLen= Integer.parseInt(chunkLenStr, 16); } catch (NumberFormatException e){ throw new ContentLengthParseException(line.toString()); } if (chunkLen == 0) { doneChunks= true; break; } if ( (contentBytesRead + chunkLen) > this.http.maxContentLength ) chunkLen= this.http.maxContentLength - contentBytesRead; // read one chunk int chunkBytesRead= 0; while (chunkBytesRead < chunkLen) { int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE; int len= in.read(bytes, 0, toRead); if (len == -1) throw new ChunkEOFException("after " + contentBytesRead + " bytes in successful chunks" + " and " + chunkBytesRead + " in current chunk"); // DANGER!!! Will printed GZIPed stuff right to your // terminal! // LOG.fine("read: " + new String(bytes, 0, len)); if (httpAccounting != null) httpAccounting.incrementBytesRead(len); out.write(bytes, 0, len); chunkBytesRead+= len; } Http.readLine(in, line, false); if (httpAccounting != null) httpAccounting.incrementBytesRead(line.length()); } if (!doneChunks) { if (contentBytesRead != this.http.maxContentLength) throw new ChunkEOFException("!doneChunk && didn't max out"); return; } this.content= out.toByteArray(); parseHeaders(in, line); } private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { Http.readLine(in, line, false); // approximate bytes by chars- should be right for HTTP if (httpAccounting != null) httpAccounting.incrementBytesRead(line.length()); int codeStart = line.indexOf(" "); int codeEnd = line.indexOf(" ", codeStart+1); // handle lines with no plaintext result code, ie: // "HTTP/1.1 200" vs "HTTP/1.1 200 OK" if (codeEnd == -1) codeEnd= line.length(); int code; try { code= Integer.parseInt(line.substring(codeStart+1, codeEnd)); } catch (NumberFormatException e) { throw new BadStatusLineException("bad status line '" + line + "': " + e.getMessage(), e); } int versionCode= -1; int servVersionCode= Http.HTTP_VER_NOTSET; try { int httpMajorVer= 0; int httpMinorVer= 0; if (line.toString().startsWith("HTTP/")) { int dotPos= line.indexOf("."); httpMajorVer= Integer.parseInt( line.substring(5, dotPos) ); httpMinorVer= Integer.parseInt( line.substring(dotPos+1, codeStart) ); if (httpMajorVer == 1) { if (httpMinorVer < 1) versionCode= Http.HTTP_VER_1_0; else versionCode= Http.HTTP_VER_1_1; } } } catch (NumberFormatException e) { ; } if (versionCode == Http.HTTP_VER_NOTSET) // bogus, always fall back servVersionCode= Http.HTTP_VER_1_0; if (httpAccounting != null) { httpAccounting.setServHttpVersion(servVersionCode); } return code; } private void processHeaderLine(StringBuffer line, TreeMap headers) throws IOException, HttpException { int colonIndex = line.indexOf(":"); // key is up to colon if (colonIndex == -1) { int i; for (i= 0; i < line.length(); i++) if (!Character.isWhitespace(line.charAt(i))) break; if (i == line.length()) return; throw new BadHeaderLineException("No colon in header:" + line); } String key = line.substring(0, colonIndex); int valueStart = colonIndex+1; // skip whitespace while (valueStart < line.length()) { int c = line.charAt(valueStart); if (c != ' ' && c != '\t') break; valueStart++; } String value = line.substring(valueStart); headers.put(key, value); } private Map parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { TreeMap headers = new TreeMap(String.CASE_INSENSITIVE_ORDER); return parseHeaders(in, line, headers); } // Adds headers to an existing TreeMap private Map parseHeaders(PushbackInputStream in, StringBuffer line, TreeMap headers) throws IOException, HttpException { while (Http.readLine(in, line, true) != 0) { // handle HTTP responses with missing blank line after headers int pos; if ( ((pos= line.indexOf("<!DOCTYPE")) != -1) || ((pos= line.indexOf("<HTML")) != -1) || ((pos= line.indexOf("<html")) != -1) ) { in.unread(line.substring(pos).getBytes("UTF-8")); line.setLength(pos); // approximate bytes by chars- should be right for HTTP if (httpAccounting != null) httpAccounting.incrementBytesRead(pos); try { processHeaderLine(line, headers); } catch (Exception e) { // fixme: e.printStackTrace(); } return headers; } // approximate bytes by chars- should be right for HTTP if (httpAccounting != null) httpAccounting.incrementBytesRead(line.length()); processHeaderLine(line, headers); } return headers; } }